Project Part 2

Group 9

Authors
Affiliation

Thimoté Dupuch

University of Twente

Joris van Lierop

University of Twente

Jurre van Sijpveld

University of Twente

Published

September 27, 2024

Loading libraries

library(dplyr)
library(forcats)
library(ggplot2)
library(plotly)
library(broom)
library(boot)
library(caret)
library(pROC)
library(knitr)
library(MASS)
library(data.table)

Loading dataset

# dataset <- read.csv("SMARTc.csv", sep = ";") # Without missing values
# Read the semicolon-separated SMART cohort file; data.table::fread is used
# instead of read.csv purely for speed (returns a data.table).
dataset <- fread("SMARTc.csv", sep = ";") # Much faster

Re-encode the categorical variables

# Convert the integer-coded columns to labelled factors.
# fct_recode(factor(x), "label" = "code") first turns the raw codes into
# factor levels, then renames each level to a descriptive label.
dataset <- mutate(dataset,
    EVENT = fct_recode(factor(EVENT), "no" = "0", "yes" = "1"),
    SEX = fct_recode(factor(SEX), "male" = "1", "female" = "2"),
    DIABETES = fct_recode(factor(DIABETES), "no" = "0", "yes" = "1"),
    SMOKING = fct_recode(factor(SMOKING),
        "never" = "1", "former" = "2", "current" = "3"
    ),
    alcohol = fct_recode(factor(alcohol),
        "never" = "1", "former" = "2", "current" = "3"
    ),
    CEREBRAL = fct_recode(factor(CEREBRAL), "no" = "0", "yes" = "1"),
    CARDIAC = fct_recode(factor(CARDIAC), "no" = "0", "yes" = "1"),
    AAA = fct_recode(factor(AAA), "no" = "0", "yes" = "1"),
    PERIPH = fct_recode(factor(PERIPH), "no" = "0", "yes" = "1"),
    albumin = fct_recode(factor(albumin),
        "no" = "1", "micro" = "2", "macro" = "3"
    ),
    STENOSIS = fct_recode(factor(STENOSIS), "no" = "0", "yes" = "1")
)

Logistic regression model of EVENT

Assessing performance using cross-validation

# 10-fold cross-validation of the logistic model: refit on k-1 folds,
# predict the held-out fold, and record each fold's ROC curve and AUC.
k <- 10
set.seed(123) # fixed seed so the fold assignment is reproducible
folds <- createFolds(dataset$EVENT, k = k, list = TRUE, returnTrain = FALSE)
roc_list <- vector("list", k) # preallocate one slot per fold
auc_values <- numeric(k)

for (i in seq_len(k)) {
    train <- dataset[-folds[[i]], ]
    test <- dataset[folds[[i]], ]

    # Model is fitted on the training folds only
    fit_train <- glm(EVENT ~ AGE + SEX + BMI + SYSTH + HDL + DIABETES +
        HISTCAR2 + HOMOC + log(CREAT) + STENOSIS + IMT + SMOKING +
        alcohol + albumin, data = train, family = "binomial")

    # Out-of-fold predicted probabilities of EVENT == "yes"
    predict_test <- predict(fit_train, newdata = test, type = "response")
    roc_i <- roc(test$EVENT, predict_test)
    roc_list[[i]] <- roc_i
    auc_values[i] <- auc(roc_i)
}
display code to plot the ROC curves
# Stack each fold's ROC coordinates into one long data frame for ggplot.
# NOTE(review): the column named "Specificity" actually stores
# 1 - specificity (the false positive rate); the plot's x-axis label
# ("1 - Specificity") accounts for this, but the name is misleading.
roc_df <- do.call(rbind, lapply(seq_along(roc_list), function(i) {
    data.frame(
        Fold = paste("Fold", i),
        Sensitivity = roc_list[[i]]$sensitivities,
        Specificity = 1 - roc_list[[i]]$specificities
    )
}))

# One ROC curve per cross-validation fold, drawn on a square (1:1) grid.
roc_plot <- ggplot(roc_df, aes(x = Specificity, y = Sensitivity, color = Fold)) +
    geom_line(linewidth = 0.8) +
    scale_color_brewer(palette = "Set3") +
    labs(
        title = "ROC Curves for Each Fold",
        x = "1 - Specificity",
        y = "Sensitivity",
        color = "Fold"
    ) +
    coord_equal() +
    theme_minimal(base_size = 14) +
    theme(
        legend.position = "bottom",
        plot.title = element_text(hjust = 0.5, size = 15, face = "bold")
    )

# Interactive version of the static ggplot
ggplotly(roc_plot)
# Tabulate the per-fold AUCs plus their mean.
# NOTE: rbind()-ing a character summary row coerces every column of the
# data frame to character; values are rounded first, so the rendered
# kable output is unaffected.
auc_table <- data.frame(Iteration = seq_len(k), AUC = round(auc_values, 4))
mean_auc <- round(mean(auc_values), 4)
auc_table <- rbind(auc_table, c("AUC mean", mean_auc))
kable(auc_table, caption = "AUC Values for 10-Fold Cross-Validation")
AUC Values for 10-Fold Cross-Validation
Iteration AUC
1 0.7331
2 0.6542
3 0.7895
4 0.745
5 0.6944
6 0.7877
7 0.7776
8 0.7044
9 0.7471
10 0.7574
AUC mean 0.7391
(auc_confidence_interval = t.test(auc_values)$conf.int)
[1] 0.7077229 0.7703818
attr(,"conf.level")
[1] 0.95

This means that we can be 95% confident that the true AUC value lies between 0.7077 and 0.7704.

Calibration model

Assessing the calibration using a calibration plot

# df_copy <- mutate(dataset,
#     EVENT = fct_recode(EVENT, "0" = "no", "1" = "yes"),
# )

# NOTE(review): `test` and `predict_test` here are leftovers from the LAST
# cross-validation fold above, so this calibration plot reflects only ~1/10
# of the data — consider pooling out-of-fold predictions across all folds.
# EVENT levels are c("no", "yes"), so as.numeric() - 1 maps them to 0/1.
observed_outcomes <- as.numeric(test$EVENT) - 1
calibration_data <- data.frame(Predicted = predict_test, Observed = observed_outcomes)
display code to plot the calibration plot
# Calibration plot: loess-smoothed observed outcome against predicted
# probability; the dashed identity line marks perfect calibration.
calibration_plot <- ggplot(calibration_data, aes(x = Predicted, y = Observed)) +
    geom_smooth(method = "loess", se = FALSE, formula = y ~ x, color = "#3ca7c7") +
    geom_point(size = 2, alpha = 0.5) +
    geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "#ff8800") +
    theme_minimal(base_size = 14) +
    theme(plot.title = element_text(hjust = 0.5, size = 15, face = "bold")) +
    labs(
        title = "Calibration Plot",
        x = "Predicted Probability",
        y = "Observed Probability"
    )

ggplotly(calibration_plot)

The orange dashed line is the ideal calibration line, where the predicted probabilities perfectly match the observed outcomes. The blue curve shows the actual calibration of the model — how the predictions from the model align with the observed data.

For the low predicted probabilities (0.0 to 0.1), the blue curve is relatively flat and near the dashed line, indicating that the model is well-calibrated for low-probability predictions. For the moderate predicted probabilities (0.1 to 0.3), the blue curve deviates from the orange dashed line and rises faster than the ideal line. This suggests overprediction in this range: the model tends to predict higher probabilities than are observed in reality. For higher predicted probabilities (above 0.3), the blue curve is consistently above the orange line, indicating a general overestimation of risk. This means that when the model predicts a higher probability of an event, the actual frequency is lower than what is predicted. This overestimation can lead to false positives, where the model predicts events that don’t actually occur as often as expected.

A well calibrated model is useful because it provides accurate probabilities that can be used to make informed decisions. If a model is poorly calibrated, the predicted probabilities may not accurately reflect the true likelihood of an event, which can lead to suboptimal decisions. In this case, the model tends to overestimate the risk of having a cardiovascular event, which could lead to unnecessary interventions or treatments.

Performance comparison between the logistic regression model and an LDA model

# Compare logistic regression (GLM) and LDA with a fresh 10-fold CV,
# recording per-fold AUC (discrimination) and Brier score (calibration).
# NOTE(review): no set.seed() immediately before createFolds() here, so
# these folds differ from the first CV (the RNG stream simply continues);
# the run is still reproducible when the script executes top to bottom.
k <- 10
folds <- createFolds(dataset$EVENT, k = k, list = TRUE, returnTrain = FALSE)
auc_values_glm <- numeric(k)
auc_values_lda <- numeric(k)
brier_scores_glm <- numeric(k)
brier_scores_lda <- numeric(k)

# Identical predictor set for both models
fit_formula <- EVENT ~ AGE + SEX + BMI + SYSTH + HDL + DIABETES +
    HISTCAR2 + HOMOC + log(CREAT) + STENOSIS + IMT + SMOKING +
    alcohol + albumin


for (i in seq_len(k)) {
    train <- dataset[-folds[[i]], ]
    test <- dataset[folds[[i]], ]

    fit_train <- glm(fit_formula, data = train, family = "binomial")
    lda_train <- lda(fit_formula, data = train)

    # Predicted probability of EVENT == "yes" from each model;
    # posterior[, 2] is the "yes" column because levels are c("no", "yes")
    predict_test_glm <- predict(fit_train, newdata = test, type = "response")
    predict_test_lda <- predict(lda_train, newdata = test)$posterior[, 2]

    roc_glm <- roc(test$EVENT, predict_test_glm)
    roc_lda <- roc(test$EVENT, predict_test_lda)

    auc_values_glm[i] <- auc(roc_glm)
    auc_values_lda[i] <- auc(roc_lda)

    # 0/1 outcome for the Brier score (mean squared prediction error)
    test$EVENT_numeric <- ifelse(test$EVENT == "yes", 1, 0)

    brier_scores_glm[i] <- mean((predict_test_glm - test$EVENT_numeric)^2)
    brier_scores_lda[i] <- mean((predict_test_lda - test$EVENT_numeric)^2)
}
# Per-fold AUC comparison table for the two models plus their means.
# NOTE: appending the character summary row via rbind() coerces all
# columns to character — display-only here, so harmless.
auc_table <- data.frame(
    Iteration = seq_len(k),
    AUC_GLM = auc_values_glm,
    AUC_LDA = auc_values_lda
)
mean_auc_glm <- mean(auc_values_glm)
mean_auc_lda <- mean(auc_values_lda)
auc_table <- rbind(auc_table, c("AUC mean", mean_auc_glm, mean_auc_lda))
kable(auc_table, caption = "AUC Values for 10-Fold Cross-Validation")
AUC Values for 10-Fold Cross-Validation
Iteration AUC_GLM AUC_LDA
1 0.689213311232947 0.680606910620936
2 0.694632156062731 0.681308172893026
3 0.757618258319521 0.746716817544307
4 0.787836287135025 0.786816269284712
5 0.830854309687262 0.833015509788965
6 0.754876960346806 0.761379574142548
7 0.733329083258957 0.736261634578605
8 0.755960729312763 0.760550809639169
9 0.670099160945843 0.672260361047546
10 0.720760233918129 0.719171116196288
AUC mean 0.739518049021998 0.73780871757361
# Per-fold Brier score comparison (lower = better calibrated).
# NOTE: appending the character summary row via rbind() coerces all
# columns to character — display-only here, so harmless.
brier_scores_table <- data.frame(
    Iteration = seq_len(k),
    Brier_Score_GLM = brier_scores_glm,
    Brier_Score_LDA = brier_scores_lda
)

mean_brier_score_glm <- mean(brier_scores_glm)
mean_brier_score_lda <- mean(brier_scores_lda)
brier_scores_table <- rbind(
    brier_scores_table,
    c("Brier Score mean", mean_brier_score_glm, mean_brier_score_lda)
)

kable(brier_scores_table, caption = "Brier Scores for 10-Fold Cross-Validation")
Brier Scores for 10-Fold Cross-Validation
Iteration Brier_Score_GLM Brier_Score_LDA
1 0.0980693094964066 0.101151527699879
2 0.0957994926276176 0.0992645798727123
3 0.0937325644094975 0.0975508816258736
4 0.0934333213044997 0.0949895733456367
5 0.0856124284059258 0.0841449827380715
6 0.0953253710852003 0.0956756178111479
7 0.0930190550803127 0.0935793549109133
8 0.0965181691367765 0.0976024186782206
9 0.0965950257935387 0.0975672443459604
10 0.0980470551266242 0.101430931284722
Brier Score mean 0.09461517924664 0.0962957112313137
(auc_confidence_interval_glm <- t.test(auc_values_glm)$conf.int)
[1] 0.7047454 0.7742907
attr(,"conf.level")
[1] 0.95
(auc_confidence_interval_lda <- t.test(auc_values_lda)$conf.int)
[1] 0.7010771 0.7745404
attr(,"conf.level")
[1] 0.95

This means that we can be 95% confident that the true AUC value for the logistic regression model lies between 0.7047 and 0.7743, and for the LDA model between 0.7011 and 0.7745.

(brier_score_confidence_interval_glm <- t.test(brier_scores_glm)$conf.int)
[1] 0.09201474 0.09721562
attr(,"conf.level")
[1] 0.95
(brier_score_confidence_interval_lda <- t.test(brier_scores_lda)$conf.int)
[1] 0.09275269 0.09983874
attr(,"conf.level")
[1] 0.95

This means that we can be 95% confident that the true Brier score for the logistic regression model lies between 0.0920 and 0.0972, and for the LDA model between 0.0928 and 0.0998.

Conclusion

The logistic regression model and the LDA model have similar performance in terms of AUC and Brier score. The logistic regression model has a slightly lower Brier score, indicating better calibration, but the difference is small. Both models have similar AUC values, indicating similar discrimination performance.